Chapter 3 Data statistics

load("data/data.Rdata")

3.1 Sequencing reads statistics

sample_metadata %>% 
    summarise(Total=sum(reads_post_fastp * 150 / 1000000000) %>% round(2), 
              mean=mean(reads_post_fastp * 150 / 1000000000) %>% round(2),
              sd=sd(reads_post_fastp * 150 / 1000000000) %>% round(2)) %>%
    unite("Average",mean, sd, sep = " ± ", remove = TRUE) %>%
    tt()
tinytable_xzd8razg2qifcj19wy2j
Total Average
154.38 4.98 ± 1.49

3.2 DNA fractions

sequence_fractions <- read_counts %>%
  pivot_longer(-genome, names_to = "sample", values_to = "value") %>%
  group_by(sample) %>%
  summarise(mags = sum(value)) %>%
    left_join(sample_metadata, by = join_by(sample == sample)) %>%
    select(sample,mags,metagenomic_bases,host_bases,bases_lost_fastp_percent) %>%
    mutate(mags_bases = mags*146) %>%
    mutate(lowqual_bases = ((metagenomic_bases+host_bases)/(1-bases_lost_fastp_percent))-(metagenomic_bases+host_bases)) %>%
    mutate(unmapped_bases = metagenomic_bases - mags_bases) %>%
    mutate(unmapped_bases = ifelse(unmapped_bases < 0, 0, unmapped_bases)) %>%
    select(sample, lowqual_bases, host_bases, unmapped_bases, mags_bases)

sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  tt()
tinytable_az2ci2lzkz7irirwda3i
Sample Low quality Mapped to host Unmapped Mapped to MAGs
EHI01966 0.3568965 0.005030679 1.9178067 2.507921
EHI01967 0.2692620 0.000709418 0.8339877 4.617569
EHI01968 0.3467658 0.000167184 1.2259195 5.669136
EHI01970 0.2568282 0.000235551 1.1356328 3.939774
EHI01979 0.3068304 0.000429188 1.1643291 3.963429
EHI01982 0.2321305 0.003106822 1.2765585 2.120459
EHI01985 0.3350121 0.000294977 1.3705302 4.091743
EHI01989 0.3249292 0.000570792 1.2444290 3.406020
EHI01992 0.2112149 0.000650860 1.3810604 2.863645
EHI01995 0.2441431 0.001014484 1.0559948 2.625781
EHI02065 0.2881934 0.000248787 0.9495023 3.524029
EHI02088 0.5380240 0.002508850 1.7069668 4.880649
EHI02582 0.1565447 0.000041661 0.7099037 3.061117
EHI02584 0.2131393 0.000243149 1.3719978 3.909083
EHI02585 0.3055922 0.000693419 1.3837783 4.156742
EHI02587 0.2002879 0.001053113 1.1718219 3.773711
EHI02592 0.2204754 0.000807086 1.0309674 3.998601
EHI02598 0.1922634 0.000085434 1.1517125 3.984019
EHI02602 0.1798841 0.000148220 1.2512663 2.802077
EHI02603 0.2165665 0.002208379 1.3526444 3.734602
EHI02607 0.2170496 0.000239951 1.5274046 3.978722
EHI02612 0.1376030 0.000170810 0.6921896 2.540514
EHI02615 0.1893992 0.000351599 0.6650246 1.618650
EHI02617 0.1957782 0.000243466 1.3981847 2.879747
EHI02619 0.2292718 0.000086283 0.9636172 3.865591
EHI02624 0.2077297 0.000351284 1.0437963 3.443889
EHI02625 0.1169758 0.000023503 0.5396744 1.996437
EHI02630 0.1994379 0.000258866 0.9279985 3.240507
EHI02632 0.3725667 0.000748839 1.4198031 2.929934
EHI02633 0.2780827 0.000903128 1.3810886 5.477885
EHI02639 0.4685676 0.001126284 2.1791314 8.022893
sequence_fractions %>%
  mutate_at(vars(-sample), ~./1000000000) %>%
  rename("Sample"=1, "Low quality"=2, "Mapped to host"=3, "Unmapped"=4, "Mapped to MAGs"=5) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
  tt()
tinytable_uqhvwhdhwxudu5oeh0er
Low quality Mapped to host Unmapped Mapped to MAGs
0.2583047 0.0007984537 1.207249 3.665319
sequence_fractions %>%
    pivot_longer(!sample, names_to = "fraction", values_to = "value") %>%
    mutate(value = value / 1000000000) %>%
    mutate(fraction = factor(fraction, levels = c("lowqual_bases","host_bases","unmapped_bases","mags_bases"))) %>%
  
    ggplot(., aes(x = sample, y = value, fill=fraction)) +
        geom_bar(position="stack", stat = "identity") +
      scale_fill_manual(name="Sequence type",
                    breaks=c("lowqual_bases","host_bases","unmapped_bases","mags_bases"),
                    labels=c("Low quality","Mapped to host","Unmapped","Mapped to MAGs"),
                    values=c("#CCCCCC", "#bcdee1", "#d8b8a3","#93655c"))+
        labs(x = "Samples", y = "Amount of data (GB)") +
        theme_classic() +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "bottom")

3.3 Recovered microbial fraction

singlem_table <- sequence_fractions %>%
    mutate(mags_proportion = round((mags_bases / (mags_bases + unmapped_bases))*100,2)) %>%
    left_join(sample_metadata, by = join_by(sample == sample))  %>%
    mutate(singlem_proportion = round(singlem_fraction*100,2)) %>%
    select(sample,mags_proportion,singlem_proportion) %>%
    mutate(mags_proportion = ifelse(singlem_proportion == 0, 0, mags_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion == 0, NA, singlem_proportion)) %>% #convert zeros to NA
    mutate(singlem_proportion = ifelse(singlem_proportion < mags_proportion, NA, singlem_proportion)) %>% #if singlem is smaller, then NA, to simplify plot
    mutate(singlem_proportion = ifelse(singlem_proportion > 100, 100, singlem_proportion)) #simplify

singlem_table %>%
    pivot_longer(!sample, names_to = "proportion", values_to = "value") %>%
    mutate(proportion = factor(proportion, levels = c("mags_proportion","singlem_proportion"))) %>%
    ggplot(., aes(x = value, y = sample, color=proportion)) +
            geom_line(aes(group = sample), color = "#f8a538") +
            geom_point() +
      scale_color_manual(name="Proportion",
                    breaks=c("mags_proportion","singlem_proportion"),
                    labels=c("Recovered","Estimated"),
                    values=c("#52e1e8", "#876b53"))+
            theme_classic() +
            labs(y = "Samples", x = "Prokaryotic fraction (%)") +
        theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, size=6),legend.position = "right")

damr <- singlem_table %>%
  mutate(damr=ifelse(is.na(singlem_proportion),100,mags_proportion/singlem_proportion*100)) %>%
  select(sample,damr)

damr %>% 
  tt()
tinytable_b1l5bbhfwxewlle2633q
sample damr
EHI01966 100.00000
EHI01967 100.00000
EHI01968 100.00000
EHI01970 100.00000
EHI01979 100.00000
EHI01982 100.00000
EHI01985 100.00000
EHI01989 100.00000
EHI01992 100.00000
EHI01995 100.00000
EHI02065 100.00000
EHI02088 100.00000
EHI02582 100.00000
EHI02584 99.78431
EHI02585 100.00000
EHI02587 100.00000
EHI02592 100.00000
EHI02598 100.00000
EHI02602 100.00000
EHI02603 100.00000
EHI02607 100.00000
EHI02612 100.00000
EHI02615 100.00000
EHI02617 100.00000
EHI02619 100.00000
EHI02624 100.00000
EHI02625 100.00000
EHI02630 100.00000
EHI02632 100.00000
EHI02633 100.00000
EHI02639 100.00000
damr %>% 
  filter(damr>95) %>%
  nrow()
[1] 31
damr %>% 
  summarise(mean=mean(damr),sd=sd(damr)) %>% 
  tt()
tinytable_dga63uob37elgfnz84ww
mean sd
99.99304 0.03873935